#!/usr/bin/env bash

# Print the node power values from nodes with an Nvidia GPU.
# The ClusterShell (clush) tool is required.
# An SSH command executes the nvidia-smi tool on the nodes and extracts the power values.

# Command usage:
function usage()
{
	# Write the command synopsis and the option summary to stdout.
	# The program name shown is whatever $0 expands to; every option line
	# is indented with a single tab character.
	local indent=$'\t'
	local -a help_lines=(
		"Usage: $0 < -w node-list | -p partition(s) | -a | -h > -s"
		"where:"
		"${indent}-w node-list: Print this node-list"
		"${indent}-p partition(s): Print this partition"
		"${indent}-a: All nodes in the cluster"
		"${indent}-s: Summarize total power only"
		"${indent}-h: Print help information"
		"${indent}Default node-list: All nodes with a GRES value of \"gpu\""
	)
	printf '%s\n' "${help_lines[@]}"
}

# Node-list to query. Stays empty unless -w, -p or -a supplies one, in which
# case the GRES-based default is used further down.
export nodelist=""
# 0: print one line per node plus the total; 1 (set by -s): print only the summary line.
export summarize=0
# Scratch file that receives the sorted clush output.
# The assignment is kept separate from "export" so that a mktemp failure is
# not hidden behind export's own (always successful) exit status.
TMPFILE=$(mktemp) || { echo "ERROR: Unable to create a temporary file"; exit 1; }
export TMPFILE
# Remove the scratch file whenever the script exits, including the early
# "exit 0" / "exit 1" paths taken during option parsing and sanity checks.
trap 'rm -f -- "$TMPFILE"' EXIT

# Parse the command line options (their meaning is listed by usage()).
#   -p and -a resolve the node-list through sinfo, -w takes it verbatim,
#   -s selects summary-only output, -h prints the help text.
while getopts "p:w:ash" options; do
	case "$options" in
		p )	# Quote the partition argument so it reaches sinfo as one word,
			# and assign before exporting so sinfo's status is not masked.
			nodelist=$(sinfo -h -p "$OPTARG" -O NodeList:)
			export nodelist
			if [[ -z "$nodelist" ]]
			then
				echo "ERROR: Partition $OPTARG does not exist on this cluster"
				usage
				exit 1
			fi
			;;
		a )	nodelist=$(sinfo -h --all -O NodeList:)
			export nodelist
			;;
		w )	export nodelist="$OPTARG" ;;
		s )	export summarize=1 ;;
		h )	usage
			exit 0 ;;
		* )	# getopts sets the variable to "?" for an unknown option or a
			# missing option argument (after printing its own diagnostic).
			# That is an error, not a help request, so exit non-zero.
			usage
			exit 1 ;;
	esac
done

# Test for extraneous command line arguments.
# OPTIND is the index of the first argument getopts did not consume, so the
# options account for exactly OPTIND-1 arguments; anything beyond that is an
# unexpected positional argument.
if (( $# > OPTIND - 1 ))
then
	# Quoted so the arguments are echoed verbatim instead of being
	# word-split and glob-expanded by the shell.
	echo "ERROR: Too many command line arguments: $*"
	usage
	exit 1
fi

# Print a comma-separated list of the node names whose GRES value mentions "gpu".
# Outputs: one line on stdout, or nothing when no node matches.
# Notes:
#   - "-o '%N %G'" prints node names and GRES without the fixed column width
#     that "-O" fields get by default, so long node lists are not clipped.
#   - Only the GRES column is matched, so a node that merely has "gpu" in its
#     name is not selected.
#   - sinfo may print several lines (one per distinct GRES value); they are
#     joined with commas so the result is a single word.
function default_gpu_nodelist()
{
	sinfo -h -o '%N %G' |
		awk 'tolower($2) ~ /gpu/ { printf "%s%s", sep, $1; sep = "," } END { if (sep != "") print "" }'
}

# Default nodelist: nodes with a GRES value of "gpu"
if [[ -z "$nodelist" ]]
then
	nodelist=$(default_gpu_nodelist)
	export nodelist
fi

# Sanity check of nodelist
# The node-list may consist of several whitespace-separated pieces (sinfo can
# print more than one line). Split it without pathname expansion, because
# bracket ranges such as "gpu[01-04]" are also glob patterns, then join the
# pieces with commas so sinfo receives exactly one -n argument.
read -r -d '' -a nodelist_words <<<"$nodelist"
nodelist_csv=$(IFS=,; printf '%s' "${nodelist_words[*]}")
# An empty node-list is rejected up front: passing nothing after -n would
# make sinfo take the following option as the node name.
if [[ -z "$nodelist_csv" || -z "$(sinfo -h -N -n "$nodelist_csv" -O NodeList:)" ]]
then
	echo "ERROR: Node-list $nodelist does not exist on this cluster"
	usage
	exit 1
fi

# The ClusterShell tool clush is required to run nvidia-smi on the nodes.
# "command -v" is a shell builtin whose exit status says whether the command
# can be found. Unlike "which" it is always available, and the test no longer
# depends on what text the lookup prints.
if ! command -v clush > /dev/null 2>&1
then
	echo "The ClusterShell tool clush is missing"
	exit 1
fi

# Join the whitespace-separated pieces of a node-list into one comma-separated
# node set.
# Arguments: $1 - node-list, possibly spanning several words or lines
# Outputs:   the joined node set on stdout (an empty line for an empty list)
# The pieces are split with "read", which performs no pathname expansion, so
# bracket ranges such as "gpu[01-04]" are never matched against file names.
function nodelist_to_nodeset()
{
	local -a pieces
	read -r -d '' -a pieces <<<"$1"
	local IFS=,
	printf '%s\n' "${pieces[*]}"
}

# Run nvidia-smi on every node and store the sorted result in $TMPFILE.
# The remote command:
#   - produces no output on nodes where nvidia-smi cannot be found,
#   - keeps the nvidia-smi lines containing "W ",
#   - strips the first "W" from each of them (the empty sed pattern reuses /W/),
#   - sums the 5th field and prints that sum.
# The node set is passed as ONE quoted word: with an unquoted multi-word
# node-list every word after the first would become part of the remote command.
clush -w "$(nodelist_to_nodeset "$nodelist")" 'which nvidia-smi 2>/dev/null > /dev/null && nvidia-smi | grep "W " | sed /W/s/// | awk "{power+=\$5}END{print power}"' | sort > "$TMPFILE"

# Stop here when the collected output file is missing or has zero size,
# i.e. no node delivered any nvidia-smi data.
[[ -s $TMPFILE ]] || {
	echo "ERROR: The nvidia-smi output file from nodes $nodelist is empty"
	exit 1
}

# Print the GPU power report from a collected result file.
# Arguments:
#   $1 - file whose lines carry the power value in the second
#        whitespace-separated field (the first field is the node label)
#   $2 - 0: print a header, every line of the file and the total;
#        anything else: print a single line with node count, total and average
# Outputs: the report on stdout
# The file is fed to awk by redirection; no separate cat process is needed,
# and the quoted name survives a temporary directory containing whitespace.
function print_power_report()
{
	local result_file=$1
	local summary_only=$2
	if [[ $summary_only == 0 ]]
	then
		awk 'BEGIN{print "Node GPU power (Watt)"}{print $0; total+=$2}END{print "TOTAL: " total}' < "$result_file"
	else
		awk '{total+=$2; nodes++}END{printf("TOTAL node GPU power for %d nodes= %d Watt, average= %d Watt\n", nodes, total, total/nodes)}' < "$result_file"
	fi
}

print_power_report "$TMPFILE" "$summarize"

# Remove the scratch file; "--" and the quotes keep an unusual path from
# being split into several arguments or read as an option.
rm -f -- "$TMPFILE"
